Assignment 8 Analysis

In this assignment, I walk through the data performing various analysis



In [2]:

    
# Import necessary libraries
% matplotlib inline

from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import *
import csv
from mpl_toolkits.mplot3d import Axes3D



In [3]:

    
# Read in the data
data = open('../data/data.csv', 'r').readlines()
fieldnames = ['x', 'y', 'z', 'unmasked', 'synapses']
reader = csv.reader(data)
reader.next()

rows = [[int(col) for col in row] for row in reader]

# Format the data sorted by physical location on slide
sorted_x = sorted(list(set([r[0] for r in rows])))
sorted_y = sorted(list(set([r[1] for r in rows])))
sorted_z = sorted(list(set([r[2] for r in rows])))



In [4]:

    
# Synapse data (as in rows from last code block) considering only unmasked synapses
unmaskedSynapses = ([r[-1] for r in rows if r[-2] != 0])

# Synapse data considering only unmasked synapses with non-zero density values
unmaskedSynapsesNoZero = ([r[-1] for r in rows if r[-2] != 0 if r[-1] !=0])

real_volume = np.zeros((len(sorted_x), len(sorted_y), len(sorted_z)))
for r in rows:
    real_volume[sorted_x.index(r[0]), sorted_y.index(r[1]), sorted_z.index(r[2])] = r[-1]

Kernal Density Estimation



In [5]:

    
unmaskedSynapses = np.asarray(unmaskedSynapses)
sns.kdeplot(unmaskedSynapses)









    Out[5]:





<matplotlib.axes._subplots.AxesSubplot at 0x105dca950>

Examining xy density projections through image z layers

This didn't work how I wanted to, but there is more I can do here.



In [6]:

    
count = 0
for i in sorted_z:
    unmaskedSynapsesNoZero = ([r[-1] for r in rows if r[-2] != 0 if r[-1] !=0 if r[2] == i])
    # convert to pandas dataframe
    df = pd.DataFrame(unmaskedSynapsesNoZero, columns=['xy'])
    df.plot()
    count = count + 1
    
# df
    
#     syn_den = sns.load_dataset(df)









    Out[6]:






  
    
      
      xy
    
  
  
    
      0
      1
    
    
      1
      1
    
    
      2
      2
    
    
      3
      1
    
    
      4
      1
    
    
      5
      1
    
    
      6
      1
    
    
      7
      1
    
    
      8
      1
    
    
      9
      3
    
    
      10
      1
    
    
      11
      3
    
    
      12
      5
    
    
      13
      1
    
    
      14
      2
    
    
      15
      2
    
    
      16
      3
    
    
      17
      2
    
    
      18
      5
    
    
      19
      5
    
    
      20
      3
    
    
      21
      2
    
    
      22
      5
    
    
      23
      2
    
    
      24
      1
    
    
      25
      2
    
    
      26
      1
    
    
      27
      2
    
    
      28
      3
    
    
      29
      2
    
    
      ...
      ...
    
    
      4637
      99
    
    
      4638
      80
    
    
      4639
      84
    
    
      4640
      46
    
    
      4641
      21
    
    
      4642
      5
    
    
      4643
      67
    
    
      4644
      95
    
    
      4645
      55
    
    
      4646
      3
    
    
      4647
      1
    
    
      4648
      54
    
    
      4649
      108
    
    
      4650
      112
    
    
      4651
      46
    
    
      4652
      36
    
    
      4653
      55
    
    
      4654
      42
    
    
      4655
      1
    
    
      4656
      32
    
    
      4657
      16
    
    
      4658
      13
    
    
      4659
      11
    
    
      4660
      39
    
    
      4661
      47
    
    
      4662
      15
    
    
      4663
      22
    
    
      4664
      2
    
    
      4665
      1
    
    
      4666
      2
    
  

4667 rows × 1 columns

Single-layer heat map



In [7]:

    
count = 0
for i in sorted_z:
    df = pd.DataFrame(real_volume[:,:,count])
#     print real_volume[:,:,count]
#     ax = sns.heatmap(df)
    count = count + 1
    
ax = sns.heatmap(df, yticklabels=False, xticklabels=False)
    
#     syn_den = sns.load_dataset(df)









    



/Library/Python/2.7/site-packages/matplotlib/collections.py:571: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Cluster map



In [8]:

    
cmap = sns.cubehelix_palette(as_cmap=True, rot=-.3, light=1)
g = sns.clustermap(df, cmap=cmap, linewidths=.5)

Cluster map with standardization across columns



In [9]:

    
# standardize data across all columns
g = sns.clustermap(df, standard_scale=1)

Analysis using Principal Components

PCA from single 2D Layer

Here we use a single z layer and find the principle components



In [10]:

    
pca = PCA(n_components=5)
pca.fit(df)
print(pca.explained_variance_ratio_)









    



[ 0.54632801  0.06806289  0.03887388  0.03446306  0.03072315]

Explained Variance



In [11]:

    
# make a square figure and axes
figure(1, figsize=(6,6))
ax = axes([0.1, 0.1, 0.8, 0.8])

# The slices will be ordered and plotted counter-clockwise.
labels = 'PC1', 'PC2', 'PC3', 'PC4', 'PC5'
fracs = pca.explained_variance_ratio_
print fracs

pie(fracs, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)

title('First 5 PC Explained Variance', bbox={'facecolor':'0.8', 'pad':5})

show()









    



[ 0.54632801  0.06806289  0.03887388  0.03446306  0.03072315]

We can see how the first five pricipal components explain variance in the 2D z-layers ("z" w.r.t. image, not brain)

PCA from single all three dimensions

We want to use PCA to go from 3D data --> 2D. This can help us visualize data, and will reorient our view hopefully in such a way that we can see brain layers.

First, let's see what the 3D data looks like:



In [12]:

    
ax.scatter(real, y, z, c='b')









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-0cf5d7a370d7> in <module>()
----> 1 ax.scatter(real, y, z, c='b')

NameError: name 'y' is not defined



In [13]:

    
#Attempt to plot the entire data set in 3d in order to find some other characteristics?
#The plot does not look that great. Too dense. Needs adjustment
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

#print unmaskedSynapsesNoZero
x = []
y = []
z = []
val = []

for r in rows:
    if r[-2] != 0:
        if r[-1] !=0:
            x.append(r[0])
            y.append(r[1])
            z.append(r[2])
            
ax.scatter(x, y, z, c='b')
plt.show()

This plot is too messy. It also doesn't say much about the density at particular locations. To solve this, we need to do two things:

Make scatter marker size based on the density
Find a threshold and ignore the least dense points

I'll start by making a function that does these plots so we don't copy paste.



In [14]:

    
def make3DSynapsePlot(thresh):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    x = []
    y = []
    z = []
    d = []
    val = []

    for r in rows:
        if r[-2] != 0:
            # Filtering 
            if r[-1] > thresh:
                x.append(r[0])
                y.append(r[1])
                z.append(r[2])
                d.append(r[-1])

    ax.scatter(x, y, z, c='b')
    plt.show()

We know from our histogram that there is a density peak around 190. Let's start by filtering out points below this peak.



In [15]:

    
make3DSynapsePlot(190)

This is still too dense. Let's really crank up the threshold to 280. Note: In the future, I want to use standard deviation to determine this. While this would be easy on the raw data, we actually want to use the standard deviation on the second gaussian given by gmm. (@richie)



In [16]:

    
make3DSynapsePlot(280)

That's more like it! Now, let's add in some density information. We can base the bubble size by the density at that location.



In [85]:

    
def make3DSynapsePlot(thresh):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    x = []
    y = []
    z = []
    d = []
    val = []

    for r in rows:
        if r[-2] != 0:
            # Filtering 
            if r[-1] > thresh:
                x.append(r[0])
                y.append(r[1])
                z.append(r[2])
                d.append(r[-1])
                
    #Scale and raise it to appropriate power
    dens = [((i-min(d))**1.2) for i in d]

    ax.scatter(x, y, z, c='b', s=dens)
    fig.suptitle('3D Distribution of Synapses', fontsize=20)
    return ax



In [86]:

    
ax = make3DSynapsePlot(310)
ax.set_xlabel('x-axis', fontsize=16)
ax.set_ylabel('y-axis', fontsize=16)
ax.set_zlabel('z-axis', fontsize=16)
ax.view_init(elev=60, azim=125)
plt.show()



In [ ]:

    
def plot_figs(fig_num, elev, azim):
    fig = plt.figure(fig_num, figsize=(4, 3))
    plt.clf() #clear the figure
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=elev, azim=azim)

    ax.scatter(x[::10], y[::10], z[::10], c='b', marker='+', alpha=.4)
    Y = np.c_[x, y, z]

    # Using SciPy's SVD, this would be:
    # _, pca_score, V = scipy.linalg.svd(Y, full_matrices=False)

    pca = PCA(n_components=3)
    pca.fit(Y)
    pca_score = pca.explained_variance_ratio_
    V = pca.components_

    x_pca_axis, y_pca_axis, z_pca_axis = V.T * pca_score / pca_score.min()

    x_pca_axis, y_pca_axis, z_pca_axis = 3 * V.T
    x_pca_plane = np.r_[x_pca_axis[:2], - x_pca_axis[1::-1]]
    y_pca_plane = np.r_[y_pca_axis[:2], - y_pca_axis[1::-1]]
    z_pca_plane = np.r_[z_pca_axis[:2], - z_pca_axis[1::-1]]
    x_pca_plane.shape = (2, 2)
    y_pca_plane.shape = (2, 2)
    z_pca_plane.shape = (2, 2)
    ax.plot_surface(x_pca_plane, y_pca_plane, z_pca_plane)
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])


elev = -40
azim = -80
plot_figs(1, elev, azim)

# elev = 30
# azim = 20
# plot_figs(2, elev, azim)

plt.show()

Transforming data to PC



In [ ]:

    
pca.transform(df)



In [ ]:

    
# Plot the transformed data

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x = []
y = []
z = []
val = []

for r in rows:
    if r[-2] != 0:
        if r[-1] !=0:
            x.append(r[0])
            y.append(r[1])
            z.append(r[2])
            
ax.scatter(x, y, z, c='b')
plt.show()

	xy
0	1
1	1
2	2
3	1
4	1
5	1
6	1
7	1
8	1
9	3
10	1
11	3
12	5
13	1
14	2
15	2
16	3
17	2
18	5
19	5
20	3
21	2
22	5
23	2
24	1
25	2
26	1
27	2
28	3
29	2
...	...
4637	99
4638	80
4639	84
4640	46
4641	21
4642	5
4643	67
4644	95
4645	55
4646	3
4647	1
4648	54
4649	108
4650	112
4651	46
4652	36
4653	55
4654	42
4655	1
4656	32
4657	16
4658	13
4659	11
4660	39
4661	47
4662	15
4663	22
4664	2
4665	1
4666	2